ifelse(!dir.exists(outdir), dir.create(outdir), FALSE)
[1] TRUE
model.cca <- readRDS("~/models/modelCCA_reference_hvg_PBMC_SCElist_20191105.RDS")
model.liger <- readRDS("~/models/modelLiger_reference_hvg_PBMC_SCElist_20191105.RDS")
model.conos <- readRDS("~/models/modelConos_reference_hvg_PBMC_SCElist_20191105.RDS")
seu.cca <- readRDS("~/models/labelTransferCCA_reference_hvg_PBMC_SCElist_20191105.RDS")
# seu.cca.union <- readRDS("~/models/labelTransferCCA_PBMC_SCElist_20191105.RDS")
seu.liger <- readRDS("~/models/labelTransferLiger_reference_hvg_PBMC_SCElist_20191105.RDS")
seu.conos <- readRDS("~/models/labelTransferConos_reference_hvg_PBMC_SCElist_20191105.RDS")
integrate_features <- model.cca$model@anchor.features
int.list <- list(CCA=seu.cca, Liger=seu.liger, Conos=seu.conos)
## Make method color palette
method.palette <- brewer_palette_4_values(names(int.list), "Set1")
Visualize label transfer on original ATAC data (embedded SnapATAC bins)
## Load original data
orig.ATAC <- readRDS("~/my_data/10X_data/atac_pbmc_10k_nextgem.snapATAC.RDS")
sce.list <- readRDS("~/my_data/10X_data/PBMC_SCElist_20191105.RDS")
orig.RNA <- sce.list$RNA
Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply,
parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB
The following objects are masked from ‘package:dplyr’:
combine, intersect, setdiff, union
The following object is masked from ‘package:Matrix’:
which
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated,
eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply, match,
mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rownames,
sapply, setdiff, sort, table, tapply, union, unique, unsplit, which, which.max, which.min
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following objects are masked from ‘package:dplyr’:
first, rename
The following object is masked from ‘package:tidyr’:
expand
The following object is masked from ‘package:Matrix’:
expand
The following object is masked from ‘package:base’:
expand.grid
Loading required package: IRanges
Attaching package: ‘IRanges’
The following objects are masked from ‘package:dplyr’:
collapse, desc, slice
The following object is masked from ‘package:purrr’:
reduce
Loading required package: GenomeInfoDb
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor,
see 'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: DelayedArray
Loading required package: matrixStats
Attaching package: ‘matrixStats’
The following objects are masked from ‘package:Biobase’:
anyMissing, rowMedians
The following object is masked from ‘package:dplyr’:
count
Loading required package: BiocParallel
Attaching package: ‘DelayedArray’
The following objects are masked from ‘package:matrixStats’:
colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
The following object is masked from ‘package:purrr’:
simplify
The following objects are masked from ‘package:base’:
aperm, apply, rowsum
Attaching package: ‘SummarizedExperiment’
The following object is masked from ‘package:Seurat’:
Assays
## Make SeuratObjects
atac.seu <- snapToSeurat(
obj=orig.ATAC,
eigs.dims=1:20,
norm=TRUE,
scale=TRUE
)
Epoch: checking input parameters ...
Non-unique features (rownames) present in the input matrix, making uniqueFeature names cannot have underscores ('_'), replacing with dashes ('-')Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Centering and scaling data matrix
|
| | 0%
|
|== | 2%
|
|==== | 4%
|
|===== | 5%
|
|======= | 7%
|
|========= | 9%
|
|=========== | 11%
|
|============= | 12%
|
|============== | 14%
|
|================ | 16%
|
|================== | 18%
|
|==================== | 19%
|
|====================== | 21%
|
|======================= | 23%
|
|========================= | 25%
|
|=========================== | 26%
|
|============================= | 28%
|
|=============================== | 30%
|
|================================= | 32%
|
|================================== | 33%
|
|==================================== | 35%
|
|====================================== | 37%
|
|======================================== | 39%
|
|========================================== | 40%
|
|=========================================== | 42%
|
|============================================= | 44%
|
|=============================================== | 46%
|
|================================================= | 47%
|
|=================================================== | 49%
|
|==================================================== | 51%
|
|====================================================== | 53%
|
|======================================================== | 54%
|
|========================================================== | 56%
|
|============================================================ | 58%
|
|============================================================= | 60%
|
|=============================================================== | 61%
|
|================================================================= | 63%
|
|=================================================================== | 65%
|
|===================================================================== | 67%
|
|====================================================================== | 68%
|
|======================================================================== | 70%
|
|========================================================================== | 72%
|
|============================================================================ | 74%
|
|============================================================================== | 75%
|
|================================================================================ | 77%
|
|================================================================================= | 79%
|
|=================================================================================== | 81%
|
|===================================================================================== | 82%
|
|======================================================================================= | 84%
|
|========================================================================================= | 86%
|
|========================================================================================== | 88%
|
|============================================================================================ | 89%
|
|============================================================================================== | 91%
|
|================================================================================================ | 93%
|
|================================================================================================== | 95%
|
|=================================================================================================== | 96%
|
|===================================================================================================== | 98%
|
|=======================================================================================================| 100%
atac.seu <- RenameCells(atac.seu, new.names = orig.ATAC@metaData$barcode)
## Add cell type predictions
getPredictedLabels <- function(seu.int, int.name, id.col="predicted.id", score.col="score"){
pred.df <- seu.int$ATAC@meta.data[,c(id.col, score.col), drop=F]
rownames(pred.df) <- str_remove(rownames(pred.df), "^ATAC_")
colnames(pred.df) <- c(str_c("predicted.id", "_", int.name), str_c("score", "_", int.name))
pred.df
}
pred.cca <- getPredictedLabels(seu.cca, "CCA", score.col = "prediction.score.max")
pred.liger <- getPredictedLabels(seu.liger, "Liger")
pred.conos <- getPredictedLabels(seu.conos, "Conos")
# pred.cca.union <- getPredictedLabels(seu.cca.union, "CCA.union", score.col = "prediction.score.max")
# cbind(pred.cca, pred.cca.union)
if (all(rownames(pred.conos) == rownames(pred.cca)) & all(rownames(pred.conos) == rownames(pred.liger))) {
atac.seu <- AddMetaData(atac.seu, metadata = cbind(pred.cca, pred.liger, pred.conos))
} else {
stop("Non corresponding cell names")
}
plotly::ggplotly(pl)
geom_GeomTextRepel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
orig.RNA.seu <- as.Seurat(orig.RNA)
orig.RNA.seu <- FindVariableFeatures(orig.RNA.seu)
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
orig.RNA.seu <- ScaleData(orig.RNA.seu)
Centering and scaling data matrix
|
| | 0%
|
|========================================================== | 50%
|
|====================================================================================================================| 100%
orig.RNA.seu <- RunPCA(orig.RNA.seu)
PC_ 1
Positive: LTB, CD3E, TRAC, CD3D, TRBC2, LCK, CD3G, IL32, ETS1, IL7R
TCF7, CD247, CD27, CD7, BCL11B, CD69, CD2, ISG20, SPOCK2, TRBC1
GZMM, ARL4C, LAT, FCMR, CCR7, RORA, SYNE2, RASGRP1, LINC00861, LEF1
Negative: CSTA, MNDA, SERPINA1, FGL2, FCN1, NCF2, CPVL, CD68, MPEG1, CST3
MS4A6A, CD14, CLEC7A, CD36, AC020656.1, SLC7A7, VCAN, CSF3R, TGFBI, CFD
TNFAIP2, CYBB, KLF4, SPI1, GRN, S100A12, TNFSF13B, KCTD12, CFP, TIMP2
PC_ 2
Positive: ANXA1, IL32, CD3E, CD247, S100A4, LCK, TMSB4X, GZMM, CD7, CD3D
S100A10, CD3G, ITGB2, TRBC1, TRAC, CD2, IL7R, LAT, KLRB1, RORA
GZMA, ZAP70, CTSW, GAPDH, NEAT1, BCL11B, CST7, S100A6, PRF1, SAMD3
Negative: IGHM, BANK1, CD79A, SPIB, FAM129C, MS4A1, CD22, IGHD, TSPAN13, LINC00926
BCL11A, HLA-DQA1, VPREB3, TNFRSF13C, TCL1A, BLNK, BLK, RALGPS2, COBLL1, JCHAIN
MZB1, HLA-DQB1, FCRL5, FCRLA, HLA-DQA2, CD79B, AFF3, FCRL1, TCF4, FCRL2
PC_ 3
Positive: LTB, CCR7, IL7R, FOSB, MAL, TRABD2A, LEF1, CD3G, TCF7, CD3D
CD27, VIM, SLC2A3, COTL1, TRAC, BIRC3, CAMK4, TSHZ2, PASK, NOSIP
ZFP36L2, FHIT, FOS, NCF1, INPP4B, MYC, MS4A1, NELL2, TNFRSF13C, ADTRP
Negative: GZMB, CLIC3, KLRF1, KLRD1, NKG7, PRF1, GNLY, SPON2, CST7, FGFBP2
TRDC, GZMA, ADGRG1, CCL4, C12orf75, GZMH, MATK, TBX21, PTGDR, CCL5
S1PR5, HOPX, CD160, AKR1C3, PTCRA, FCGR3A, TTC38, RHOC, MYOM2, SLAMF7
PC_ 4
Positive: GZMB, KLRD1, CYBA, KLRF1, PRF1, CLIC3, NKG7, FGFBP2, SPON2, GNLY
CST7, MT-CO1, GZMA, CCL4, ADGRG1, TRDC, HOPX, MATK, GZMH, TBX21
FCGR3A, S1PR5, CD160, MYOM2, AKR1C3, TTC38, IL2RB, XCL2, APOBEC3G, SH2D1B
Negative: TUBB1, GP9, CMTM5, CLDN5, TREML1, TMEM40, CTTN, CAVIN2, CLEC1B, ITGA2B
PF4, GNG11, C2orf88, ACRBP, AC147651.1, HIST1H2BJ, CLU, ARHGAP6, SH3BGRL2, GMPR
AP001189.1, PDZK1IP1, GNAZ, AC090409.1, SMOX, SPARC, MPIG6B, ESAM, SCN1B, MYL9
PC_ 5
Positive: SCT, CLEC4C, LILRA4, LRRC26, TNFRSF21, SCAMP5, PHEX, KCNK17, PACSIN1, SCN9A
SMPD3, DNASE1L3, CYP46A1, RHEX, TPM2, MAP1A, SERPINF1, SHD, LAMP5, IL3RA
LINC00996, PPM1J, ASIP, AL096865.1, AC097375.1, SMIM5, CIB2, GAS6, ZFAT, CUX2
Negative: MS4A1, LINC00926, CD22, CD79A, BANK1, PDLIM1, CD79B, IGHD, VPREB3, TNFRSF13C
KLRF1, FGFBP2, KLRD1, FCRL1, RALGPS2, ADGRG1, FCER2, GNLY, FCRL2, TRDC
CCL4, HOPX, PAX5, CD72, FCRL5, PRF1, CST7, ADAM28, PTGDR, SWAP70
orig.RNA.seu <- RunUMAP(orig.RNA.seu, dims=1:30)
15:52:11 UMAP embedding parameters a = 0.9922 b = 1.112
15:52:12 Read 5607 rows and found 30 numeric columns
15:52:12 Using Annoy for neighbor search, n_neighbors = 30
15:52:12 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:52:13 Writing NN index file to temp file /tmp/RtmpXEhLFb/file9c191c14dd
15:52:13 Searching Annoy index using 1 thread, search_k = 3000
15:52:14 Annoy recall = 100%
15:52:16 Commencing smooth kNN distance calibration using 1 thread
15:52:18 Initializing from normalized Laplacian + noise
15:52:19 Commencing optimization for 500 epochs, with 240156 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:52:33 Optimization finished
DimPlot(orig.RNA.seu, group.by="annotation", label = TRUE)
Quantifies the uncertainty of the prediction. Calculated differently for every method, but used to define which cells are “unassigned”.
ggarrange(predict_score_hist, predict_score_cumedist, common.legend = TRUE, widths = c(0.8, 1.2),
labels=c("A", "B")) +
ggsave(paste0(outdir, "prediction_score_distribution.pdf"), height = 6, width = 10)
Removed 180 rows containing non-finite values (stat_ecdf).
Compare cell type fractions (w uncertainty)
pred.labels.df %>%
group_by(method) %>%
drop_na() %>%
mutate(tot.cells=n()) %>%
ungroup() %>%
group_by(method, predicted.id) %>%
summarise(tot.label = n(), tot.cells = max(tot.cells), mean.score=mean(score)) %>%
mutate(frac.label=tot.label/tot.cells) %>%
# bind_rows(orig.frac.df) %>%
ggplot(aes(method, predicted.id)) +
geom_point(aes(color=mean.score, size=frac.label)) +
scale_color_continuous() +
scale_color_gradient(low ="grey", high="blue") +
theme_classic(base_size = 16)
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
Does the uncertainty depend on the size of the cluster?
original size?
pred.labels.df %>%
group_by(method) %>%
drop_na() %>%
mutate(tot.cells=n()) %>%
ungroup() %>%
group_by(method, predicted.id) %>%
summarise(tot.label = n(), tot.cells = max(tot.cells), mean.score=median(score), sd.score=mad(score)) %>%
mutate(frac.label=tot.label/tot.cells) %>%
left_join(select(orig.frac.df, predicted.id, frac.label), by="predicted.id", suffix=c(".int",".original")) %>%
# bind_rows(orig.frac.df) %>%
ggplot(aes(frac.label.original, mean.score, color=method)) +
geom_point(size=2) +
geom_errorbar(aes(ymin=mean.score-sd.score, ymax=mean.score+sd.score), alpha=0.6) +
scale_color_brewer(palette="Set1") +
facet_grid(.~method) +
theme_bw(base_size = 16)
Column `predicted.id` joining character vector and factor, coercing into character vector
dodge <- position_dodge(width=0.9)
pred.labels.df %>%
group_by(method, predicted.id) %>%
summarise(score.mean=mean(score, na.rm=F), score.cv = sd(score)/mean(score), n=n()) %>%
# ggplot(aes(predicted.id, score.mean, fill=method)) +
# geom_col( position = dodge) +
ggplot(aes(predicted.id, method)) +
geom_point(aes(size=score.mean, color=score.mean)) +
scale_color_gradient(low="white", high = "blue") +
# geom_errorbar(aes(ymin=score.mean-score.cv, ymax=score.mean+score.cv), position=dodge, width=0.25) +
coord_flip() +
theme_bw(base_size = 16)
pred.labels.df %>%
ggplot(aes(predicted.id, score, color=method)) +
geom_boxplot() +
coord_flip() +
scale_color_brewer(palette="Set1")
Calculate which fractions of NNs in bin based graph of ATAC cells have the same annotation
full_join(pred.labels.df, knn_score_df) %>%
ggplot(aes(KNN_score, color=method)) +
stat_ecdf() +
facet_wrap("predicted.id") +
scale_color_brewer(palette = "Set1") +
coord_fixed()
Joining, by = c("cell", "method")
map(cell.types, ~ ggarrange(plot_KNNecdf(.x), UMAPs_cluster(.x), nrow = 2, heights = c(1,0.8)))
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
[[10]]
[[11]]
[[12]]
[[13]]